import pandas as pd
import sqlalchemy as sa
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Use the seaborn-look plotly template and render figures inside VS Code notebooks.
pio.templates.default = "seaborn"
pio.renderers.default = "vscode+notebook"
# show every column when displaying wide dataframes
pd.options.display.max_columns = None
# SQLAlchemy-style connection URL for the local HR Postgres database
conn = "postgresql://postgres:pass@localhost/hr_database"
The commented-out steps below are not needed if the data has already been imported into the database some other way.
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# employee_df.to_sql("employee_survey_data", conn, if_exists="replace")
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# general_df.to_sql("general_data", conn, if_exists="replace")
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
# manager_df.to_sql("manager_survey_data", conn, if_exists="replace")
# Load the three survey tables from the Postgres database.
employee_df, general_df, manager_df = (
    pd.read_sql_table(table, conn)
    for table in ("employee_survey_data", "general_data", "manager_survey_data")
)
#######################################
# Alternatively import from csv files #
#######################################
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
# Quick sanity check of each imported table (display is the notebook helper).
display(employee_df.head())
display(general_df.head())
display(manager_df.head())
| EmployeeID | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | |
|---|---|---|---|---|
| 0 | 1 | 3.0 | 4.0 | 2.0 |
| 1 | 2 | 3.0 | 2.0 | 4.0 |
| 2 | 3 | 2.0 | 2.0 | 1.0 |
| 3 | 4 | 4.0 | 4.0 | 3.0 |
| 4 | 5 | 4.0 | 1.0 | 3.0 |
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | EmployeeCount | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | Over18 | PercentSalaryHike | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | 1 | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | Y | 11 | 8 | 0 | 1.0 | 6 | 1 | 0 | 0 |
| 1 | 2 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | 1 | Female | 1 | Research Scientist | Single | 41890 | 0.0 | Y | 23 | 8 | 1 | 6.0 | 3 | 5 | 1 | 4 |
| 2 | 3 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | 1 | Male | 4 | Sales Executive | Married | 193280 | 1.0 | Y | 15 | 8 | 3 | 5.0 | 2 | 5 | 0 | 3 |
| 3 | 4 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | 1 | Male | 3 | Human Resources | Married | 83210 | 3.0 | Y | 11 | 8 | 3 | 13.0 | 5 | 8 | 7 | 5 |
| 4 | 5 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | 1 | Male | 1 | Sales Executive | Single | 23420 | 4.0 | Y | 12 | 8 | 2 | 9.0 | 2 | 6 | 0 | 4 |
| EmployeeID | JobInvolvement | PerformanceRating | |
|---|---|---|---|
| 0 | 1 | 3 | 3 |
| 1 | 2 | 2 | 4 |
| 2 | 3 | 3 | 3 |
| 3 | 4 | 2 | 3 |
| 4 | 5 | 3 | 3 |
# Assemble one row per employee: general data enriched with the employee survey
# and the manager survey (left joins keep every employee present in general_df).
data_df = general_df.merge(employee_df, on="EmployeeID", how="left")
data_df = data_df.merge(manager_df, on="EmployeeID", how="left")
# Build per-employee working-time statistics from the clock-in/clock-out logs.
in_time_df = pd.read_csv("Datasets/in_time.csv")
out_time_df = pd.read_csv("Datasets/out_time.csv")
# The first CSV column is the employee id but has no header; name it and use
# it as the index so the two frames align row-for-row.
for frame in (in_time_df, out_time_df):
    frame.rename(columns={"Unnamed: 0": "EmployeeID"}, inplace=True)
    frame.set_index("EmployeeID", inplace=True)
# parse every clock-in / clock-out column as datetimes
in_time_df = in_time_df.apply(pd.to_datetime)
out_time_df = out_time_df.apply(pd.to_datetime)
# hours worked each day as timedeltas (NaT where the employee was absent)
daily_diff = out_time_df - in_time_df
# BUG FIX: compute both statistics from the daily columns only. Previously
# AverageTime was appended as a column first and then included in the row-wise
# sum, inflating TotalTime by one extra daily average per employee.
time_diff_df = pd.DataFrame(index=daily_diff.index)
time_diff_df["AverageTime"] = daily_diff.mean(axis=1, skipna=True)
time_diff_df["TotalTime"] = daily_diff.sum(axis=1, skipna=True)
# convert both timedelta columns to seconds
time_diff_df = time_diff_df.apply(lambda x: x / np.timedelta64(1, "s"))
time_diff_df = time_diff_df.reset_index()
time_diff_df.head()
| EmployeeID | AverageTime | TotalTime | |
|---|---|---|---|
| 0 | 1 | 26545.142241 | 6.185018e+06 |
| 1 | 2 | 27788.288136 | 6.585824e+06 |
| 2 | 3 | 25247.665289 | 6.135183e+06 |
| 3 | 4 | 25897.242553 | 6.111749e+06 |
| 4 | 5 | 28822.228571 | 7.090268e+06 |
# join time_diff_df with data_df so each employee row carries its working-time stats
data_df = data_df.merge(time_diff_df, on="EmployeeID", how="left")
data_df.head()
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | EmployeeCount | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | Over18 | PercentSalaryHike | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | AverageTime | TotalTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | 1 | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | Y | 11 | 8 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 | 26545.142241 | 6.185018e+06 |
| 1 | 2 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | 1 | Female | 1 | Research Scientist | Single | 41890 | 0.0 | Y | 23 | 8 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 | 27788.288136 | 6.585824e+06 |
| 2 | 3 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | 1 | Male | 4 | Sales Executive | Married | 193280 | 1.0 | Y | 15 | 8 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 | 25247.665289 | 6.135183e+06 |
| 3 | 4 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | 1 | Male | 3 | Human Resources | Married | 83210 | 3.0 | Y | 11 | 8 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 | 25897.242553 | 6.111749e+06 |
| 4 | 5 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | 1 | Male | 1 | Sales Executive | Single | 23420 | 4.0 | Y | 12 | 8 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 | 28822.228571 | 7.090268e+06 |
# change column dtype to str (read_sql_table yields sqlalchemy quoted_name
# column labels, which later break downstream libraries expecting plain str)
data_df.columns = data_df.columns.astype(str)
We will check for any null values, and decide if we want to drop them or impute them.
# Count rows containing at least one null value. BUG FIX: the previous
# np.count_nonzero(data_df.isna()) counted individual null CELLS, which
# overstates the row count whenever a row has several nulls.
num_null_rows = int(data_df.isna().any(axis=1).sum())
print(f"Number of rows with null values: {num_null_rows}")
print(f"Percent of rows with null values: {num_null_rows/data_df.shape[0]*100:.3f}%")
Number of rows with null values: 111 Percent of rows with null values: 2.517%
There are not very many null values so we can safely drop them without affecting the final analysis.
# Drop the few rows containing missing values, then verify no duplicates remain.
data_df_no_na = data_df.dropna()
dup_count = data_df_no_na.duplicated().sum()
print(f"Duplicate Rows: {dup_count}")
Duplicate Rows: 0
There are no duplicate rows in the dataset.
The variables in the employee and manager survey are pretty self explanatory.
# Columns that are identifiers rather than features.
ignored_columns = ["EmployeeID"]
# Continuous / count-valued columns; survey scores and other ordinal codes are
# treated as categorical below.
numerical_columns = [
    "Age",
    "DistanceFromHome",
    "EmployeeCount",
    "MonthlyIncome",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
    "AverageTime",
    "TotalTime",
]
# Everything that is neither numerical nor ignored is categorical.
categorical_columns = [
    col
    for col in data_df_no_na.columns
    if col not in numerical_columns and col not in ignored_columns
]
Create functions to visualise numerical data and categorical data
def numerical_vis(data, variable):
    """Show a histogram and a boxplot of a numerical column side by side.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the column to plot.
    variable : str
        Name of the numerical column.
    """
    # make_subplots is already imported at module level; the previous local
    # re-import and the dead commented-out matplotlib code were removed.
    hist_fig = px.histogram(data, x=variable, nbins=20)
    box_fig = px.box(data, y=variable)
    final_fig = make_subplots(rows=1, cols=2)
    final_fig.add_trace(hist_fig.data[0], row=1, col=1)
    final_fig.add_trace(box_fig.data[0], row=1, col=2)
    final_fig.update_layout(title_text=f"Histogram and Boxplot of {variable}")
    final_fig.show()
def categorical_vis(data, variable):
    """Show a pie chart and a stacked bar chart of a categorical column.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the column to plot.
    variable : str
        Name of the categorical column.
    """
    pie_fig = px.pie(data, names=variable)
    bar_fig = px.histogram(data, x=variable, color=variable)
    # a pie trace needs a "pie"-type subplot cell; bars live in an "xy" cell
    final_fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "xy"}]])
    final_fig.add_trace(pie_fig.data[0], row=1, col=1)
    for trace in bar_fig.data:
        final_fig.add_trace(trace, row=1, col=2)
    # one consolidated layout update instead of seven separate calls
    # (dead commented-out matplotlib code removed)
    final_fig.update_layout(
        barmode="stack",
        bargap=0.1,
        title_text=f"Distribution of {variable}",
        title_x=0.5,
        showlegend=False,
        title_font_size=20,
    )
    final_fig.show()
# plot the distribution of every numerical column
for var in numerical_columns:
    numerical_vis(data_df_no_na, var)
From the plots we can see a quick overview of the distribution of the variables.
# Drop EmployeeCount and StandardHours (flagged from the plots above) from
# both the data and the numerical-column list.
dropped = ["EmployeeCount", "StandardHours"]
data_df_no_na = data_df_no_na.drop(columns=dropped)
for name in dropped:
    numerical_columns.remove(name)
data_df_no_na.head()
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | Over18 | PercentSalaryHike | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | AverageTime | TotalTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | Y | 11 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 | 26545.142241 | 6.185018e+06 |
| 1 | 2 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | Female | 1 | Research Scientist | Single | 41890 | 0.0 | Y | 23 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 | 27788.288136 | 6.585824e+06 |
| 2 | 3 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | Male | 4 | Sales Executive | Married | 193280 | 1.0 | Y | 15 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 | 25247.665289 | 6.135183e+06 |
| 3 | 4 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | Male | 3 | Human Resources | Married | 83210 | 3.0 | Y | 11 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 | 25897.242553 | 6.111749e+06 |
| 4 | 5 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | Male | 1 | Sales Executive | Single | 23420 | 4.0 | Y | 12 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 | 28822.228571 | 7.090268e+06 |
# plot the distribution of every categorical column
for var in categorical_columns:
    categorical_vis(data_df_no_na, var)
The distribution of the categorical variables can be seen above.
# Drop Over18 and remove it from the categorical column list.
data_df_no_na = data_df_no_na.drop(columns=["Over18"])
categorical_columns.remove("Over18")
We will change binary categorical variables to 1, 0 and ordered categorical data to numbers
# Encode the binary target and the ordered travel column as integers.
attrition_map = {"Yes": 1, "No": 0}
travel_map = {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
data_df_no_na["Attrition"] = data_df_no_na["Attrition"].map(attrition_map)
data_df_no_na["BusinessTravel"] = data_df_no_na["BusinessTravel"].map(travel_map)
data_df_no_na["BusinessTravel"].value_counts()
data_df_no_na.head()
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | AverageTime | TotalTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | 0 | 1 | Sales | 6 | 2 | Life Sciences | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | 11 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 | 26545.142241 | 6.185018e+06 |
| 1 | 2 | 31 | 1 | 2 | Research & Development | 10 | 1 | Life Sciences | Female | 1 | Research Scientist | Single | 41890 | 0.0 | 23 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 | 27788.288136 | 6.585824e+06 |
| 2 | 3 | 32 | 0 | 2 | Research & Development | 17 | 4 | Other | Male | 4 | Sales Executive | Married | 193280 | 1.0 | 15 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 | 25247.665289 | 6.135183e+06 |
| 3 | 4 | 38 | 0 | 0 | Research & Development | 2 | 5 | Life Sciences | Male | 3 | Human Resources | Married | 83210 | 3.0 | 11 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 | 25897.242553 | 6.111749e+06 |
| 4 | 5 | 32 | 0 | 1 | Research & Development | 10 | 1 | Medical | Male | 1 | Sales Executive | Single | 23420 | 4.0 | 12 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 | 28822.228571 | 7.090268e+06 |
# Correlation heat map of the numeric columns. Passing numeric_only=True
# silences the pandas FutureWarning (seen in the earlier run) while keeping
# the current behavior of considering only numeric columns.
corr = data_df_no_na.corr(numeric_only=True)
px.imshow(
    corr,
    height=1200,
    width=1200,
    # correlation is bounded in [-1, 1]; pin the color scale to that range
    zmin=-1,
    zmax=1,
    title="Correlation Matrix",
    color_continuous_scale=px.colors.diverging.RdYlGn,
)
/tmp/ipykernel_473790/249899736.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
def compare_categorical_vis(data, variable):
    """Overlayed histogram of a categorical column, split by Attrition."""
    figure = px.histogram(
        data,
        x=variable,
        color="Attrition",
        barmode="overlay",
        title=f"Histogram of {variable} by Attrition",
    )
    figure.update_layout(bargap=0.1)
    figure.show()
def compare_numerical_vis(data, variable):
    """Histogram and boxplot of a numerical column, split by Attrition."""
    hist_fig = px.histogram(data, x=variable, color="Attrition")
    box_fig = px.box(data, x="Attrition", y=variable, color="Attrition")
    final_fig = make_subplots(rows=1, cols=2)
    # each source figure holds one trace per Attrition class (0 and 1)
    for idx in (0, 1):
        final_fig.add_trace(hist_fig.data[idx], row=1, col=1)
    for idx in (0, 1):
        final_fig.add_trace(box_fig.data[idx], row=1, col=2)
    final_fig.update_layout(
        title_text=f"Attrition by {variable}",
        title_x=0.5,
        barmode="overlay",
    )
    final_fig.show()
# compare every numerical column against attrition
for var in numerical_columns:
    compare_numerical_vis(data_df_no_na, var)
# compare every categorical column against attrition
for var in categorical_columns:
    compare_categorical_vis(data_df_no_na, var)
# quick look at the data after the encoding steps above
data_df_no_na.head()
| EmployeeID | Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | Gender | JobLevel | JobRole | MaritalStatus | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | AverageTime | TotalTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 51 | 0 | 1 | Sales | 6 | 2 | Life Sciences | Female | 1 | Healthcare Representative | Married | 131160 | 1.0 | 11 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 | 26545.142241 | 6.185018e+06 |
| 1 | 2 | 31 | 1 | 2 | Research & Development | 10 | 1 | Life Sciences | Female | 1 | Research Scientist | Single | 41890 | 0.0 | 23 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 | 27788.288136 | 6.585824e+06 |
| 2 | 3 | 32 | 0 | 2 | Research & Development | 17 | 4 | Other | Male | 4 | Sales Executive | Married | 193280 | 1.0 | 15 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 | 25247.665289 | 6.135183e+06 |
| 3 | 4 | 38 | 0 | 0 | Research & Development | 2 | 5 | Life Sciences | Male | 3 | Human Resources | Married | 83210 | 3.0 | 11 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 | 25897.242553 | 6.111749e+06 |
| 4 | 5 | 32 | 0 | 1 | Research & Development | 10 | 1 | Medical | Male | 1 | Sales Executive | Single | 23420 | 4.0 | 12 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 | 28822.228571 | 7.090268e+06 |
EmployeeID is an identifier that doesn't affect attrition, so we will drop it.
# drop EmployeeID -- it is an identifier and should not be used as a feature
data_df_no_na = data_df_no_na.drop(columns=["EmployeeID"])
Next we will encode the categorical variables.
# one-hot encode the remaining string-valued categorical columns
data_df_numerical = pd.get_dummies(data_df_no_na)
data_df_numerical.head()
| Age | Attrition | BusinessTravel | DistanceFromHome | Education | JobLevel | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | EnvironmentSatisfaction | JobSatisfaction | WorkLifeBalance | JobInvolvement | PerformanceRating | AverageTime | TotalTime | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51 | 0 | 1 | 6 | 2 | 1 | 131160 | 1.0 | 11 | 0 | 1.0 | 6 | 1 | 0 | 0 | 3.0 | 4.0 | 2.0 | 3 | 3 | 26545.142241 | 6.185018e+06 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 31 | 1 | 2 | 10 | 1 | 1 | 41890 | 0.0 | 23 | 1 | 6.0 | 3 | 5 | 1 | 4 | 3.0 | 2.0 | 4.0 | 2 | 4 | 27788.288136 | 6.585824e+06 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 2 | 32 | 0 | 2 | 17 | 4 | 4 | 193280 | 1.0 | 15 | 3 | 5.0 | 2 | 5 | 0 | 3 | 2.0 | 2.0 | 1.0 | 3 | 3 | 25247.665289 | 6.135183e+06 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3 | 38 | 0 | 0 | 2 | 5 | 3 | 83210 | 3.0 | 11 | 3 | 13.0 | 5 | 8 | 7 | 5 | 4.0 | 4.0 | 3.0 | 2 | 3 | 25897.242553 | 6.111749e+06 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 32 | 0 | 1 | 10 | 1 | 1 | 23420 | 4.0 | 12 | 2 | 9.0 | 2 | 6 | 0 | 4 | 4.0 | 1.0 | 3.0 | 3 | 3 | 28822.228571 | 7.090268e+06 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
We will split the data into train and test sets.
from sklearn.model_selection import train_test_split
# features (X) and target (y): everything except Attrition predicts Attrition
X = data_df_numerical.drop(columns=["Attrition"])
y = data_df_numerical["Attrition"]
# 80/20 train-test split, stratified so both splits keep the attrition class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
from sklearn.metrics import accuracy_score, confusion_matrix
def train_model(model_class, Xtrain, Xtest, Ytrain, Ytest, **kwargs):
    """Fit ``model_class(**kwargs)`` on the training data, plot its confusion
    matrix on the test data and return the fitted model.

    Regressors are supported too: continuous predictions are thresholded at
    0.5 to obtain 0/1 class labels (a no-op for classifiers that already
    return hard labels).
    """
    model = model_class(**kwargs)
    model.fit(Xtrain, Ytrain)
    y_pred = model.predict(Xtest)
    # vectorized threshold instead of a per-element Python loop
    y_pred = (np.asarray(y_pred) > 0.5).astype(int)
    acc = accuracy_score(Ytest, y_pred)
    cm = confusion_matrix(Ytest, y_pred)
    fig = px.imshow(
        cm,
        height=500,
        width=500,
        labels={"color": "Count", "x": "Predicted", "y": "Actual"},
    )
    # BUG FIX: plotly titles do not render "\n"; "<br>" is the HTML line break
    fig.update_layout(
        title_text=f"Confusion Matrix of {model_class.__name__}<br>Accuracy: {acc:.3f}",
        title_x=0.5,
    )
    fig.update_xaxes(title_text="Predicted")
    fig.update_yaxes(title_text="Actual")
    fig.show()
    return model
from sklearn.linear_model import LogisticRegression, LinearRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
# baseline models: logistic regression, linear regression (thresholded), 5-NN
logistic_model = train_model(LogisticRegression, X_train, X_test, y_train, y_test)
linear_model = train_model(LinearRegression, X_train, X_test, y_train, y_test)
knn_model = train_model(
    KNeighborsClassifier, X_train, X_test, y_train, y_test, n_neighbors=5
)
All the models have an accuracy of around 80%. This seems good at first glance, but let's take a look at the predictions to see what's going on.
# Inspect the label distribution each baseline model predicts on the test set.
def _show_prediction_counts(name, raw_predictions):
    # threshold at 0.5 (no-op for models that already emit 0/1 labels)
    labels = [1 if value > 0.5 else 0 for value in raw_predictions]
    print(name)
    print(pd.Series(labels).value_counts())
_show_prediction_counts("LogisticRegression", logistic_model.predict(X_test))
_show_prediction_counts("LinearRegression", linear_model.predict(X_test))
# KNN already outputs hard class labels, no thresholding needed
y_pred = knn_model.predict(X_test)
print("KNeighborsClassifier")
print(pd.Series(y_pred).value_counts())
LogisticRegression 0 860 dtype: int64 LinearRegression 0 848 1 12 dtype: int64 KNeighborsClassifier 0 810 1 50 dtype: int64
As we can see, most of the models predict that the employee will not leave the company. And because we have around 80% people that do not leave the company, the model gets a decent accuracy score.
# xgboost classifier -- gradient-boosted trees, much stronger baseline
from xgboost import XGBClassifier
xgb_model = train_model(XGBClassifier, X_train, X_test, y_train, y_test)
This classifier already has a much better accuracy score than the previous models. Let us calculate other metrics to see how it does.
from sklearn.metrics import (
accuracy_score,
precision_score,
classification_report,
confusion_matrix,
)
# print the accuracy, precision, sensitivity, and specificity
def print_metrics(y_test, y_pred):
    """Print accuracy, precision, sensitivity and specificity of binary predictions."""
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        # sensitivity = recall of the positive class; specificity = recall of the negative class
        "Sensitivity": tp / (tp + fn),
        "Specificity": tn / (tn + fp),
    }
    for name, value in metrics.items():
        print(f"{name}: {value:.3f}")
# evaluate the XGBoost model on the held-out test split
print_metrics(y_test, xgb_model.predict(X_test))
Accuracy: 0.995 Precision: 1.000 Sensitivity: 0.971 Specificity: 1.000
Above,
Accuracy is the proportion of correct predictions
Precision is the proportion of positive identifications which were actually correct
Sensitivity is the proportion of actual positives which were identified correctly
Specificity is the proportion of actual negatives which were identified correctly
This model has good scores for all the relevant metrics.
We can now use this model to predict if an employee will leave the company or not. But the original dataset has a lot of variables, and we need all of these to predict if an employee will leave or not.
# Build a tidy feature-importance table. Constructing directly from a dict
# keeps "importance" as a float column (the previous two-row-then-transpose
# construction left both columns as object dtype), so the sort is a genuine
# numeric sort.
feature_importance_df = pd.DataFrame(
    {
        "feature": xgb_model.feature_names_in_,
        "importance": xgb_model.feature_importances_,
    }
).sort_values(by="importance", ascending=False)
# horizontal bar chart of importances
fig = px.bar(feature_importance_df, x="importance", y="feature", orientation="h")
fig.show()
So the most important features (top 10) are:
Let's take the cleaned data, apply all the preprocessing steps, and train a new model with only the most important features.
# Re-train using only the most important features identified above.
selected_features = [
    "EducationField",
    "JobRole",
    "MaritalStatus",
    "TotalWorkingYears",
    "EnvironmentSatisfaction",
    "YearsSinceLastPromotion",
    "BusinessTravel",
    "Department",
]
# keep the selected features plus the target, then one-hot encode
data_df_small = pd.get_dummies(data_df_no_na[selected_features + ["Attrition"]])
Xsmall = data_df_small.drop(columns=["Attrition"])
ysmall = data_df_small["Attrition"]
# same split settings as before so results stay comparable
Xsmall_train, Xsmall_test, ysmall_train, ysmall_test = train_test_split(
    Xsmall, ysmall, test_size=0.2, stratify=ysmall, random_state=42
)
xgb_small_model = train_model(
    XGBClassifier, Xsmall_train, Xsmall_test, ysmall_train, ysmall_test
)
print_metrics(ysmall_test, xgb_small_model.predict(Xsmall_test))
Accuracy: 0.977 Precision: 0.961 Sensitivity: 0.892 Specificity: 0.993
The metrics for this model are still pretty good.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
def create_encoders(dataframe: pd.DataFrame):
    """One-hot encode the categorical feature columns of *dataframe*.

    Returns the fitted ColumnTransformer together with the transformed data
    as a sparse DataFrame whose columns carry the encoder's feature names.
    """
    categorical = [
        "EducationField",
        "JobRole",
        "MaritalStatus",
        "BusinessTravel",
        "Department",
    ]
    transformer = make_column_transformer(
        (OneHotEncoder(), categorical),
        remainder="passthrough",
    )
    matrix = transformer.fit_transform(dataframe)
    # the transformer yields a scipy sparse matrix; wrap it without densifying
    encoded: pd.DataFrame = pd.DataFrame.sparse.from_spmatrix(matrix)
    encoded.columns = transformer.get_feature_names_out()
    return transformer, encoded
def train_predicting_model():
    """Fit the final one-hot transformer and XGBoost model on the selected
    features of the cleaned dataset; return (transformer, model)."""
    frame = data_df_no_na.copy()
    features = frame[selected_features]
    target = frame["Attrition"]
    transformer, encoded = create_encoders(features)
    model = XGBClassifier()
    model.fit(encoded, target)
    return transformer, model
final_transformer, final_model = train_predicting_model()
def get_prediction(
    education_field: str,
    job_role: str,
    marital_status: str,
    total_working_years: float,
    environment_satisfaction: float,
    years_since_last_promotion: int,
    business_travel: int,
    department: str,
):
    """Predict attrition (1 = leaves, 0 = stays) for a single employee.

    The raw feature values are assembled into a one-row DataFrame, one-hot
    encoded with the transformer fitted in ``train_predicting_model`` and fed
    to the final XGBoost model. BusinessTravel is expected already numerically
    encoded (0 = Non-Travel, 1 = Travel_Rarely, 2 = Travel_Frequently), as
    done earlier in the preprocessing.
    """
    # create a one-row dataframe with the columns the transformer was fitted on
    input_df = pd.DataFrame(
        {
            "EducationField": [education_field],
            "JobRole": [job_role],
            "MaritalStatus": [marital_status],
            "TotalWorkingYears": [total_working_years],
            "EnvironmentSatisfaction": [environment_satisfaction],
            "YearsSinceLastPromotion": [years_since_last_promotion],
            "BusinessTravel": [business_travel],
            "Department": [department],
        }
    )
    # transform with the already-fitted encoder
    # NOTE(review): presumably unseen categorical values raise here (sklearn
    # OneHotEncoder default) -- confirm if user-facing input is expected
    transformed = final_transformer.transform(input_df)
    # the transformer returns a scipy sparse matrix; wrap it in a sparse frame
    transformed_df: pd.DataFrame = pd.DataFrame.sparse.from_spmatrix(transformed)
    transformed_df.columns = final_transformer.get_feature_names_out()
    # predict
    y_pred = final_model.predict(transformed_df)
    return y_pred[0]
# Sanity check: predict attrition for one randomly sampled employee.
row = data_df_no_na.sample()[selected_features]
pred = get_prediction(
    education_field=row["EducationField"].values[0],
    job_role=row["JobRole"].values[0],
    marital_status=row["MaritalStatus"].values[0],
    total_working_years=row["TotalWorkingYears"].values[0],
    environment_satisfaction=row["EnvironmentSatisfaction"].values[0],
    years_since_last_promotion=row["YearsSinceLastPromotion"].values[0],
    business_travel=row["BusinessTravel"].values[0],
    department=row["Department"].values[0],
)
print(pred)
0